#!/bin/bash
# Driver: runs scripts/search.sh on the raw code dataset against the
# HumanEval test target, then runs scripts/merged_jsonl.sh on the
# "resampled" directory to produce a single merged JSONL file.
#
# The helper scripts are invoked by relative path, so this script must be
# started from the directory that contains scripts/.
# NOTE(review): presumably that directory is SCRIPT_PATH -- confirm.

# Abort on the first failing step (and on unset variables) so the merge step
# never runs against missing or stale output after a failed search.
set -euo pipefail

SCRIPT_PATH=/share/projset/dsir7
# Shared prefix of the config file, the resampled directory and the output.
INDEX_DIR="${SCRIPT_PATH}/indexes/qwen_tokenizer_3_300000"

CONFIG_FILE="${INDEX_DIR}/feat_config.json"
# Only referenced by the commented-out merge command below.
CODE_DIRECTORY=/share/projset/sft_for_code_math/code_data_plain
RAW_DATASETS="${SCRIPT_PATH}/data/code_data_plain.jsonl"
# One-off preparation step, kept for reference.
# NOTE(review): looks like it builds RAW_DATASETS from CODE_DIRECTORY -- confirm.
# bash scripts/merged_jsonl.sh "$CODE_DIRECTORY" "$RAW_DATASETS"
TARGET_DATASETS="${SCRIPT_PATH}/data/humaneval+mbpp/humaneval-test.jsonl"
NUM_TO_SAMPLE=600000
MIN_EXAMPLE_LENGTH=0
# NOTE(review): presumably written by scripts/search.sh; the directory name
# appears to encode the raw/target dataset names -- verify before renaming.
MERGED_DIR="${INDEX_DIR}/code_data_plain_humaneval-test_0.0_0/resampled"
OUTFILE="${INDEX_DIR}/merged_humaneval-test.jsonl"

# Step 1: search/selection over the raw dataset.
bash scripts/search.sh \
  "$CONFIG_FILE" \
  "$RAW_DATASETS" \
  "$TARGET_DATASETS" \
  "$NUM_TO_SAMPLE" \
  "$MIN_EXAMPLE_LENGTH"

# Step 2: merge the resampled directory into one JSONL file.
bash scripts/merged_jsonl.sh "$MERGED_DIR" "$OUTFILE"